{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import malaya"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(2408498, 8178139)"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "with open('filtered.json') as fopen:\n",
    "    filtered = json.load(fopen)\n",
    "\n",
    "with open('filtered_notin.json') as fopen:\n",
    "    filtered_notin = json.load(fopen)\n",
    "    \n",
    "len(filtered), len(filtered_notin)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# from tqdm import tqdm\n",
    "\n",
    "# ind_filtered_notin, actual_filtered_notin = [], []\n",
    "# for s in tqdm(filtered_notin):\n",
    "#     if len(set(s.split()) & (ind_words)):\n",
    "#         ind_filtered_notin.append(s)\n",
    "#     else:\n",
    "#         actual_filtered_notin.append(s)\n",
    "        \n",
    "# len(ind_filtered_notin), len(actual_filtered_notin)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "from unidecode import unidecode\n",
    "import cleaning\n",
    "from tqdm import tqdm\n",
    "\n",
    "def preprocessing(string):\n",
    "    string = re.sub(\n",
    "        'http\\S+|www.\\S+',\n",
    "        '',\n",
    "        ' '.join(\n",
    "            [i for i in string.split() if i.find('#') < 0 and i.find('@') < 0]\n",
    "        ),\n",
    "    )\n",
    "    \n",
    "    chars = ',.()!:\\'\"/;=-'\n",
    "    for c in chars:\n",
    "        string = string.replace(c, f' {c} ')\n",
    "        \n",
    "    string = re.sub(\n",
    "        u'[0-9!@#$%^&*()_\\-+{}|\\~`\\'\";:?/.>,<]',\n",
    "        ' ',\n",
    "        string,\n",
    "        flags = re.UNICODE,\n",
    "    )\n",
    "    string = re.sub(r'[ ]+', ' ', string).strip()\n",
    "    \n",
    "    return string.lower()\n",
    "\n",
    "def loop(strings):\n",
    "    for i in tqdm(range(len(strings))):\n",
    "        strings[i] = preprocessing(strings[i])\n",
    "    return strings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 150531/150531 [00:02<00:00, 50698.66it/s]\n",
      "100%|██████████| 2/2 [00:00<00:00, 9822.73it/s]013.17it/s]\n",
      " 56%|█████▌    | 84310/150531 [00:02<00:03, 21736.56it/s]]\n",
      "100%|██████████| 150531/150531 [00:03<00:00, 49839.93it/s]\n",
      "100%|██████████| 150531/150531 [00:03<00:00, 45958.26it/s]\n",
      "100%|██████████| 150531/150531 [00:03<00:00, 45151.88it/s]\n",
      " 62%|██████▏   | 92887/150531 [00:02<00:01, 41921.01it/s]]\n",
      " 19%|█▊        | 28003/150531 [00:02<00:09, 12893.72it/s]]\n",
      "100%|██████████| 150531/150531 [00:03<00:00, 38563.66it/s]\n",
      "100%|██████████| 150531/150531 [00:04<00:00, 36800.68it/s]\n",
      "100%|██████████| 150531/150531 [00:04<00:00, 37287.51it/s]\n",
      "100%|██████████| 150531/150531 [00:04<00:00, 36761.06it/s]\n",
      "100%|██████████| 150531/150531 [00:04<00:00, 34548.36it/s]\n",
      "100%|██████████| 150531/150531 [00:04<00:00, 36450.62it/s]\n",
      "100%|██████████| 150531/150531 [00:04<00:00, 33289.06it/s]\n",
      "100%|██████████| 150531/150531 [00:08<00:00, 17129.31it/s]\n",
      "100%|██████████| 150531/150531 [00:11<00:00, 12806.91it/s]\n"
     ]
    }
   ],
   "source": [
    "ind_filtered_notin = cleaning.multiprocessing(filtered, loop)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 511133/511133 [00:08<00:00, 56795.82it/s]\n",
      "100%|██████████| 11/11 [00:00<00:00, 8984.88it/s]8.11it/s]\n",
      "100%|██████████| 511133/511133 [00:09<00:00, 54708.19it/s]\n",
      "100%|██████████| 511133/511133 [00:09<00:00, 55513.43it/s]\n",
      "100%|██████████| 511133/511133 [00:09<00:00, 56546.53it/s]\n",
      "100%|██████████| 511133/511133 [00:09<00:00, 54859.02it/s]\n",
      "100%|██████████| 511133/511133 [00:09<00:00, 55098.05it/s]\n",
      "100%|██████████| 511133/511133 [00:10<00:00, 50607.67it/s]\n",
      "100%|██████████| 511133/511133 [00:08<00:00, 57201.77it/s]\n",
      "100%|██████████| 511133/511133 [00:09<00:00, 54220.17it/s]\n",
      " 73%|███████▎  | 374280/511133 [00:08<00:02, 51509.93it/s]\n",
      " 93%|█████████▎| 476932/511133 [00:09<00:00, 53760.17it/s]\n",
      "100%|██████████| 511133/511133 [00:10<00:00, 49388.22it/s]\n",
      "100%|██████████| 511133/511133 [00:11<00:00, 46332.79it/s]\n",
      "100%|██████████| 511133/511133 [00:11<00:00, 45315.36it/s]\n",
      "100%|██████████| 511133/511133 [00:10<00:00, 47789.85it/s]\n",
      "100%|██████████| 511133/511133 [00:22<00:00, 22737.74it/s]\n"
     ]
    }
   ],
   "source": [
    "actual_filtered_notin = cleaning.multiprocessing(filtered_notin, loop)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(2408498, 8178139)"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(ind_filtered_notin), len(actual_filtered_notin)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('english-words.json') as fopen:\n",
    "    english_words = set(json.load(fopen))\n",
    "    \n",
    "with open('malays_word.json') as fopen:\n",
    "    malays = set(json.load(fopen))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# with open('counts_1grams-second.txt') as fopen:\n",
    "#     count = list(filter(None, fopen.read().split('\\n')))\n",
    "    \n",
    "# with open('counts_1grams.txt') as fopen:\n",
    "#     count.extend(list(filter(None, fopen.read().split('\\n'))))\n",
    "    \n",
    "# count = set([c.split('\\t')[0] for c in count])\n",
    "# len(count)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "272249"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "english_words = malaya.texts._english_words._english_words | english_words\n",
    "len(english_words)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "272249"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "english_minus = english_words - malays\n",
    "len(english_words)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "131799"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "english_minus = {i for i in english_minus if 'haha' not in i and i != 'rt' and 'yeay' not in i and\\\n",
    "                'yes' not in i and 'ooo' not in i and 'insha' not in i and 'huhu' not in i and\\\n",
    "                'insya' not in i and 'hew' not in i and 'uwuu' not in i and\\\n",
    "                 'wkwk' not in i and 'hoho' not in i and 'babi' not in i and\\\n",
    "                 'meow' not in i and 'aiii' not in i and 'alham' not in i and 'mashaa' not in i\\\n",
    "                 and i not in ['takda', 'cer']}\n",
    "\n",
    "len(english_minus)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 8178139/8178139 [00:26<00:00, 305873.52it/s]\n"
     ]
    }
   ],
   "source": [
    "rojak, malays = [], []\n",
    "for s in tqdm(actual_filtered_notin):\n",
    "    if len(set(s.split()) & english_minus):\n",
    "        rojak.append(s)\n",
    "    else:\n",
    "        malays.append(s)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(947237, 7230902)"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(rojak), len(malays)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['dia dah tua put dah nak plus dia tak start regularly kat man utd so mesti ka',\n",
       " 'guys tolong rt tweet ni sampai owner dia dapat phone ni tertinggal kat belakang teksi pakcik saya model oppo r s',\n",
       " 'jenis jenis orang stalking di media sosial pakai akun palsu pakai akun temannya sanak saudaranya handai tau',\n",
       " 'ajax spurs lah anti menstrim',\n",
       " 'lia pulang mereka semuanya pedo kecuali aku jangan mau',\n",
       " 'nice igstory harini dah tak nmpak org repost sudan meal project tu',\n",
       " 'beomgyu ngambilin confetti yang nyangkut di rambut jimin dong liat gini aja soft akutuh cha',\n",
       " 'bts at rose bowl day jadi mulai hari ini aku akan berusaha menjadi dokter strange karena cincin ini',\n",
       " 'bukan sengaja nak samakan tapi tu laaa dah ter sama kan obvious okayyy',\n",
       " 'bolehhhh hehehe']"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rojak[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['memiliki sedikit iman lebih berharga dari pada memiliki segudang emas',\n",
       " 'on jadahnyaaa in sorry bad english hihuheheho',\n",
       " 'sis tak faham apa yang mungkin ini puncanya tu',\n",
       " 'sejarah susah',\n",
       " 'loop in nama dlm email pon boleh jd issue dah org email aku reply all jelaaa ade mase pulak aku nak tengok satu nama recipients',\n",
       " 'tak sakit pun tapi saja nak bau minyak freshcare sbb bau lavender',\n",
       " 'rosmah',\n",
       " 'bila kau tengah feeling lagu raya',\n",
       " 'kekasih bayangan',\n",
       " 'hidup ni jgn terlalu nk mendongak ke atas nanti jatuh padan muka kau']"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "malays[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['ternyata kl lg sdih bisa ngasilin makanan enak',\n",
       " 'abu kampret',\n",
       " 'bapa saya suka pake oppo saya sukanya nokia kaka saya sukanya samsung yg penting punya hape aja',\n",
       " 'ngelamar kasih cincin tp kok mukanya songong ya sedih gue liatnya',\n",
       " 'caption iki nggarai uwong males nikah min ya kali manusia arep punah ngunu neg gak nikah',\n",
       " 'pertanyaannya sederhana jika kami memang dukung prabowo ngapain selama kampanye kemarin capek dukung jokowi sampa',\n",
       " 'untuk mengamankan suara partai ahmad rofiq selaku sekjen partai perindo meminta kepada seluruh caleg dan struktur',\n",
       " 'dom jakpus sih bebas mau ketemuan or shopee',\n",
       " 'bisa dapet duit ini kaga punya mobil juga kan kaya gemer gemer ini kaga',\n",
       " 'valentino rossi tidak setuju kompetisi motogp dimulai dari eropa']"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ind_filtered_notin[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('social-language.json', 'w') as fopen:\n",
    "    json.dump({'rojak': rojak,\n",
    "              'malay': malays,\n",
    "              'ind': ind_filtered_notin}, fopen)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
